{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np \n",
    "import ast \n",
    "import itertools \n",
    "from sklearn.preprocessing import StandardScaler\n",
    "import csop2_helper as csop2\n",
    "import json\n",
    "from scipy.stats import pearsonr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from importlib import reload  \n",
    "\n",
    "# Re-execute the already-imported helper module so that edits to it are picked up\n",
    "# without restarting the kernel.\n",
    "csop2 = reload(csop2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reading data \n",
    "This notebook takes in raw and processed data from [Replication Data for: Task Complexity Moderates Group Synergy](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/RP2OCY)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Phase 1 rounds and the player roster are plain CSV files.\n",
    "df_p1 = pd.read_csv(\"rounds_data_phase1_raw.csv\")\n",
    "df_players = pd.read_csv(\"players.csv\")\n",
    "\n",
    "# Phase 2 rounds: keep only the 'real-group' rows and expose the chat count as `n_chats_round`.\n",
    "# NOTE(review): read_pickle can execute arbitrary code - only load this file from a trusted source.\n",
    "df_p2 = (\n",
    "    pd.read_pickle(\"rounds_data_phase2_processed.pkl\")\n",
    "    .query(\"group_formation == 'real-group'\")\n",
    "    .reset_index(drop=True)\n",
    "    .rename(columns={\"chatLength\": \"n_chats_round\"})\n",
    ")\n",
    "\n",
    "# `workerIds` is parsed with ast.literal_eval, i.e. it is expected to hold the text of a\n",
    "# Python literal (presumably a list of worker ids - verify against the data).\n",
    "df_p2[\"workerIds\"] = [ast.literal_eval(raw_ids) for raw_ids in df_p2[\"workerIds\"]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate group composition features "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Number of females "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_n_female(df_players, set_workerIds):\n",
    "    \"\"\"Count the players in `set_workerIds` whose `gender` equals 'female'.\n",
    "\n",
    "    df_players: player table with `workerId` and `gender` columns.\n",
    "    set_workerIds: collection of worker ids used to select rows of `df_players`.\n",
    "    Returns the number of selected rows with gender 'female' (a NumPy integer).\n",
    "    \"\"\"\n",
    "    members = df_players.query(\"workerId in @set_workerIds\")\n",
    "    is_female = members[\"gender\"].values == \"female\"\n",
    "    return np.sum(is_female)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One count per Phase 2 row, computed from that row's `workerIds`.\n",
    "df_p2[\"n_females\"] = df_p2.apply(\n",
    "    lambda row: get_n_female(df_players, row[\"workerIds\"]),\n",
    "    axis=1,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Social perceptiveness "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_rme_mean_std(df_players, set_workerIds):\n",
    "    \"\"\"Return (mean, std) of `cumulativeScore_RME` over the players in `set_workerIds`.\n",
    "\n",
    "    df_players: player table with `workerId` and `cumulativeScore_RME` columns.\n",
    "    set_workerIds: collection of worker ids used to select rows of `df_players`.\n",
    "    Both statistics are computed with NumPy (`np.mean`, `np.std`), so the standard\n",
    "    deviation uses NumPy's default of ddof=0.\n",
    "    \"\"\"\n",
    "    rme_scores = df_players.query(\"workerId in @set_workerIds\")[\"cumulativeScore_RME\"]\n",
    "    group_mean = np.mean(rme_scores)\n",
    "    group_std = np.std(rme_scores)\n",
    "    return group_mean, group_std"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The helper returns a (mean, std) pair; result_type=\"expand\" spreads it over the two columns.\n",
    "df_p2[[\"rme_mean\", \"rme_std\"]] = df_p2.apply(\n",
    "    lambda row: get_rme_mean_std(df_players, row[\"workerIds\"]),\n",
    "    axis=1,\n",
    "    result_type=\"expand\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Phase 1 performance  \n",
    "* Get the mean and std of score/duration/efficiency for group composition analysis (as measured on \"Moderate\" complexity in Phase 1) \n",
    "* Get the max z-scored score/duration/efficiency to measure \"relative collective benefit\" (the relative performance gain between the group and its best member, z-scored for comparison between Phases 1 and 2)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-worker Phase 1 totals over the 'Moderate' complexity rows; efficiency = total score / total duration.\n",
    "df_p1_performance = (\n",
    "    df_p1.query(\"complexity == 'Moderate'\")\n",
    "    .groupby(\"workerIds\")[[\"score\", \"round_duration\"]]\n",
    "    .sum()\n",
    "    .rename(columns={\"score\": \"p1_score\", \"round_duration\": \"p1_duration\"})\n",
    "    .assign(p1_efficiency=lambda totals: totals[\"p1_score\"] / totals[\"p1_duration\"])\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# z-score each metric across all workers in the table (the `*_zscore_solo` columns).\n",
    "scaler = StandardScaler()\n",
    "p1_metric_columns = [\"p1_score\", \"p1_duration\", \"p1_efficiency\"]\n",
    "p1_zscore_columns = [column + \"_zscore_solo\" for column in p1_metric_columns]\n",
    "df_p1_performance[p1_zscore_columns] = scaler.fit_transform(df_p1_performance[p1_metric_columns])\n",
    "\n",
    "def get_p1_performance_mean_std(df_p1_performance, set_workerIds, metric):\n",
    "    \"\"\"Return (mean, std) of the `p1_<metric>` column over the workers in `set_workerIds`.\n",
    "\n",
    "    df_p1_performance: table with a `workerIds` column and `p1_<metric>` columns.\n",
    "    metric: one of \"score\", \"duration\", \"efficiency\" (asserted).\n",
    "    Statistics are computed with `np.mean` and `np.std`.\n",
    "    \"\"\"\n",
    "    assert metric in [\"score\", \"duration\", \"efficiency\"]\n",
    "    column = \"p1_{}\".format(metric)\n",
    "    member_values = df_p1_performance.query(\"workerIds in @set_workerIds\")[column]\n",
    "    return np.mean(member_values), np.std(member_values)\n",
    "\n",
    "def get_p1_performance_max(df_p1_performance, set_workerIds, metric):\n",
    "    \"\"\"Return the largest `p1_<metric>_zscore_solo` value among the workers in `set_workerIds`.\n",
    "\n",
    "    df_p1_performance: table with a `workerIds` column and `p1_<metric>_zscore_solo` columns.\n",
    "    metric: one of \"score\", \"duration\", \"efficiency\" (asserted).\n",
    "    \"\"\"\n",
    "    assert metric in [\"score\", \"duration\", \"efficiency\"]\n",
    "    column = f\"p1_{metric}_zscore_solo\"\n",
    "    members = df_p1_performance.query(\"workerIds in @set_workerIds\")\n",
    "    return np.max(members[column])\n",
    "\n",
    "# For every Phase 1 metric, attach the group's mean/std and the maximum member z-score to each Phase 2 row.\n",
    "for metric in [\"score\", \"duration\", \"efficiency\"]:\n",
    "    mean_column, std_column = f\"p1_{metric}_mean\", f\"p1_{metric}_std\"\n",
    "    df_p2[[mean_column, std_column]] = df_p2.apply(\n",
    "        lambda row: get_p1_performance_mean_std(df_p1_performance, row[\"workerIds\"], metric=metric),\n",
    "        axis=1,\n",
    "        result_type=\"expand\",\n",
    "    )\n",
    "    df_p2[f\"p1_{metric}_solozscore_max\"] = df_p2.apply(\n",
    "        lambda row: get_p1_performance_max(df_p1_performance, row[\"workerIds\"], metric=metric),\n",
    "        axis=1,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Survey questions \n",
    "* Binarize each group's answers to the cognitive-style survey: for each dimension, flag whether the members do *not* all share the same style."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_cogstyle_diversity(df_players, set_workerIds, survey_num):\n",
    "    \"\"\"Return True unless the selected players gave exactly one distinct answer to the survey question.\n",
    "\n",
    "    df_players: player table with `workerId` and `step1_strategyQ<survey_num>` columns.\n",
    "    set_workerIds: collection of worker ids used to select rows of `df_players`.\n",
    "    survey_num: 1, 2 or 3 (asserted).\n",
    "    \"\"\"\n",
    "    assert survey_num in [1,2,3]\n",
    "    answer_column = \"step1_strategyQ{}\".format(survey_num)\n",
    "    answers = df_players.query(\"workerId in @set_workerIds\")[answer_column]\n",
    "    distinct_answers = set(answers)\n",
    "    return len(distinct_answers) != 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One boolean column per survey question (q1..q3).\n",
    "for survey_num in [1,2,3]:\n",
    "    diversity_column = \"cogstyle_diversity_q{}\".format(survey_num)\n",
    "    df_p2[diversity_column] = df_p2.apply(\n",
    "        lambda row: get_cogstyle_diversity(df_players, row[\"workerIds\"], survey_num),\n",
    "        axis=1,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Total messages "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sum `n_chats_round` within each game and broadcast the total onto every row of that game.\n",
    "# groupby().transform() assigns the column directly, so re-running this cell simply overwrites\n",
    "# `n_chats_total`; merging the totals back onto df_p2 would instead create\n",
    "# `n_chats_total_x` / `n_chats_total_y` on a second run and break the standardization step.\n",
    "# NOTE: rows with a missing `game_id` would get NaN here (an inner merge would drop them).\n",
    "df_p2[\"n_chats_total\"] = df_p2.groupby(\"game_id\")[\"n_chats_round\"].transform(\"sum\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Turn-taking "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_consolidated_actions(df_round_data):\n",
    "    \"\"\"Collapse runs of consecutive actions by the same subject into turns.\n",
    "\n",
    "    Reads `df_round_data.subject_id` in row order and merges every run of\n",
    "    adjacent, equal ids into one `(subject_id, num_moves)` record.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df_round_data\n",
    "        Table with one row per action that exposes `subject_id.values`\n",
    "        and supports `len()` (e.g. a pandas DataFrame).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Columns `subject_id` and `num_moves`, one row per turn, in order of\n",
    "        occurrence. Empty (with the same columns) when there are no actions.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    AssertionError\n",
    "        If the per-turn move counts do not add up to the number of rows.\n",
    "    \"\"\"\n",
    "    subject_ids = df_round_data.subject_id.values\n",
    "    turns = []  # mutable [subject_id, num_moves] pairs, one per turn\n",
    "\n",
    "    for position, subject_id in enumerate(subject_ids):\n",
    "        # The first action always opens a turn, so a single-row round yields\n",
    "        # one turn of one move instead of an empty result that fails the checksum.\n",
    "        if position > 0 and subject_id == subject_ids[position - 1]:\n",
    "            turns[-1][1] += 1\n",
    "        else:\n",
    "            turns.append([subject_id, 1])\n",
    "\n",
    "    consolidated_actions = [tuple(turn) for turn in turns]\n",
    "\n",
    "    # Checksum: every action must be counted in exactly one turn.\n",
    "    assert sum(num_moves for _, num_moves in consolidated_actions) == len(df_round_data)\n",
    "\n",
    "    return pd.DataFrame(columns=[\"subject_id\", \"num_moves\"], data=consolidated_actions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/mosobay/opt/anaconda3/envs/py38/lib/python3.8/site-packages/scipy/stats/stats.py:1021: RuntimeWarning: invalid value encountered in double_scalars\n",
      "  return a.std(axis, ddof=ddof) / a.mean(axis)\n",
      "/Users/mosobay/Dropbox (MIT)/research/csop2/dataverse/csop2_helper.py:17: RuntimeWarning: invalid value encountered in double_scalars\n",
      "  rmad = mad/np.mean(x)\n"
     ]
    }
   ],
   "source": [
    "# Collapse each round's action log into turns (one small table per row).\n",
    "df_p2[\"consolidated_actions\"] = df_p2.apply(\n",
    "    lambda row: get_consolidated_actions(row[\"round_data\"]),\n",
    "    axis=1,\n",
    ")\n",
    "\n",
    "# Turn-taking index = (number of turns - 1) / (ROUNDFEAT_SOLNS_num_inter_soln - 1).\n",
    "# NOTE(review): the denominator is zero whenever ROUNDFEAT_SOLNS_num_inter_soln == 1.\n",
    "n_turns = df_p2[\"consolidated_actions\"].map(len)\n",
    "df_p2[\"turn_taking_index\"] = (n_turns - 1) / (df_p2[\"ROUNDFEAT_SOLNS_num_inter_soln\"] - 1)\n",
    "\n",
    "# Dispersion of the turns, as computed by the csop2 helper module.\n",
    "df_p2[\"turn_taking_coefvar\"] = df_p2.apply(\n",
    "    lambda row: csop2.get_turn_taking_variation(row[\"consolidated_actions\"]),\n",
    "    axis=1,\n",
    ")\n",
    "df_p2[\"turn_taking_gini\"] = df_p2.apply(\n",
    "    lambda row: csop2.get_turn_taking_gini(row[\"consolidated_actions\"]),\n",
    "    axis=1,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/mosobay/Dropbox (MIT)/research/csop2/dataverse/csop2_helper.py:17: RuntimeWarning: invalid value encountered in double_scalars\n",
      "  rmad = mad/np.mean(x)\n"
     ]
    }
   ],
   "source": [
    "# Same Gini helper as for `turn_taking_gini`, but applied to the raw per-round action log.\n",
    "# NOTE(review): this passes `round_data`, not `consolidated_actions` - confirm the helper accepts both.\n",
    "df_p2[\"int_solns_gini\"] = df_p2.apply(\n",
    "    lambda row: csop2.get_turn_taking_gini(row[\"round_data\"]),\n",
    "    axis=1,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Standardize and export "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Outcome columns, standardized within each complexity level below.\n",
    "outcomes = [\"score\", \"round_duration\", \"efficiency\"]\n",
    "\n",
    "# Team-level features, standardized across games below.\n",
    "team_level_features = (\n",
    "    [\"n_females\", \"rme_mean\", \"rme_std\"]\n",
    "    + [f\"p1_{metric_name}_{stat}\" for metric_name in (\"score\", \"duration\", \"efficiency\") for stat in (\"mean\", \"std\")]\n",
    "    + [f\"cogstyle_diversity_q{question}\" for question in (1, 2, 3)]\n",
    "    + [\"n_chats_total\"]\n",
    ")\n",
    "\n",
    "# Round-level features, standardized within each complexity level below.\n",
    "round_level_features = [\"n_chats_round\", \"turn_taking_index\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Standardize outcomes, then round-level features, separately within each complexity level\n",
    "for feature_group in (outcomes, round_level_features):\n",
    "    zscore_columns = [name + \"_zscore\" for name in feature_group]\n",
    "    for complexity in csop2.COMPLEXITIES:\n",
    "        scaler = StandardScaler()\n",
    "        in_complexity = df_p2.complexity == complexity\n",
    "        df_p2.loc[in_complexity, zscore_columns] = scaler.fit_transform(df_p2.loc[in_complexity, feature_group])\n",
    "\n",
    "\n",
    "#Standardize team-level features: fit on one row per game_id, then transform every row\n",
    "one_row_per_game = df_p2.drop_duplicates(\"game_id\")\n",
    "scaler = StandardScaler().fit(one_row_per_game[team_level_features])\n",
    "team_zscore_columns = [name + \"_zscore\" for name in team_level_features]\n",
    "df_p2.loc[:, team_zscore_columns] = scaler.transform(df_p2.loc[:, team_level_features])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "nominal_block                                             980\n",
       "max_soln_dist                                              62\n",
       "mean_soln_dist                                             62\n",
       "turn_taking_gini                                           13\n",
       "int_solns_gini                                             13\n",
       "turn_taking_coefvar                                        13\n",
       "zscore_time_to_first_step                                   0\n",
       "zscore_efficiency                                           0\n",
       "zscore_round_duration                                       0\n",
       "zscore_score                                                0\n",
       "intermediate_solution_pace                                  0\n",
       "normalized_best_score                                       0\n",
       "mean_soln_dist_fillna                                       0\n",
       "max_soln_dist_fillna                                        0\n",
       "zscore_time_from_first_step_to_best                         0\n",
       "zscore_time_from_best_to_final                              0\n",
       "solution_distances                                          0\n",
       "zscore_time_from_final_to_submit                            0\n",
       "zscore_ROUNDFEAT_SOLNS_num_inter_soln                       0\n",
       "first_complete_is_final                                     0\n",
       "game_id                                                     0\n",
       "time_from_best_to_final                                     0\n",
       "zscore_max_soln_dist_fillna                                 0\n",
       "ROUNDFEAT_SOLNS_index_first_complete                        0\n",
       "ROUNDFEAT_SCORES_highest_complete_score                     0\n",
       "ROUNDFEAT_SCORES_bool_submitted_highest_complete_score      0\n",
       "best_solution_timestamp                                     0\n",
       "best_solution_index                                         0\n",
       "round_start_timestamp                                       0\n",
       "first_step_timestamp                                        0\n",
       "first_complete_timestamp                                    0\n",
       "final_solution_timestamp                                    0\n",
       "time_to_first_step                                          0\n",
       "time_from_first_step_to_first_complete                      0\n",
       "time_from_first_complete_to_final                           0\n",
       "time_from_final_to_submit                                   0\n",
       "time_to_best_solution                                       0\n",
       "time_from_first_step_to_best                                0\n",
       "zscore_intermediate_solution_pace                           0\n",
       "n_females                                                   0\n",
       "zscore_normalized_best_score                                0\n",
       "p1_score_std_zscore                                         0\n",
       "n_chats_round_zscore                                        0\n",
       "turn_taking_index_zscore                                    0\n",
       "n_females_zscore                                            0\n",
       "rme_mean_zscore                                             0\n",
       "rme_std_zscore                                              0\n",
       "p1_score_mean_zscore                                        0\n",
       "p1_duration_mean_zscore                                     0\n",
       "turn_taking_index                                           0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Data completeness check: the 50 columns with the most missing values, largest count first\n",
    "missing_per_column = df_p2.isna().sum()\n",
    "missing_per_column.sort_values(ascending=False).head(50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Guard the export on the expected row count.\n",
    "# NOTE(review): 196 * 5 is presumably 196 groups x 5 rounds each - confirm and name the constants.\n",
    "expected_n_rows = 196 * 5\n",
    "assert len(df_p2) == expected_n_rows\n",
    "\n",
    "# Write the same table twice: pickle (keeps Python objects) and CSV (without the index).\n",
    "df_p2.to_pickle(\"./phase_2_within_sample_processed.pkl\")\n",
    "df_p2.to_csv(\"./phase_2_within_sample_processed.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py38",
   "language": "python",
   "name": "py38"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
